Projet KikiCkisenVa - Prédiction

## Load the training set. The files are comma-separated with "." decimals,
## so read.csv is the correct reader: the original read.csv2(sep = ",")
## kept its default dec = ",", which silently turns "."-decimal columns
## into character/factor columns.
data_train <- read.csv("spreadsheets/data_train.csv")
data_train <- na.omit(data_train)
## fact.data: project helper — presumably recodes qualitative columns as
## factors (TODO confirm; definition not in this file).
data_train <- fact.data(data_train)
dim(data_train)
## [1] 784  32
head(data_train)
## Same pipeline for the test set (one fewer column: no Attrition label).
data_test <- read.csv("spreadsheets/data_test.csv")
data_test <- na.omit(data_test)
data_test <- fact.data(data_test)
dim(data_test)
## [1] 332  31
head(data_test)
## Keep only the numeric columns; vapply() is type-stable, unlike the
## original unlist(lapply(...)).
data_train_num <- data_train[, vapply(data_train, is.numeric, logical(1))]
## Re-attach the class label by name rather than by the hard-coded
## position 16; with 15 numeric predictors it still lands in column 16,
## which the rest of the script relies on via data_train_num[-16].
data_train_num["Attrition"] <- data_train["Attrition"]
dim(data_train_num)
## [1] 784  16
head(data_train_num)
## The test set has no label, so it keeps its 15 numeric columns.
data_test_num <- data_test[, vapply(data_test, is.numeric, logical(1))]
dim(data_test_num)
## [1] 332  15
head(data_test_num)
library(FactoMineR)
## Log-transform the 15 numeric predictors (column 16 is the Attrition
## label), then standardise each ROW: t(scale(t(x))) centres/scales
## across variables for every individual rather than per column.
data_train_log <- log(data_train_num[-16])
## log(0) yields -Inf; reset those cells to 0.
## NOTE(review): log of a negative value would give NaN and is not
## handled here — assumed absent from the data, to be confirmed.
data_train_log[data_train_log == -Inf] <- 0
data_train_log <- t(scale(t(data_train_log)))
data_train_log <- as.data.frame(data_train_log)
## Re-attach the label so PCA can treat it as a supplementary
## qualitative variable (quali.sup = 16), excluded from the axes.
data_train_log[16] <- data_train["Attrition"]
## Keep only the first two principal components for plotting.
coord_data_train <- PCA(data_train_log, scale.unit = TRUE, graph = FALSE, quali.sup = 16)$ind$coord[,1:2]
## Individuals on the first factorial plane, coloured by true label.
plot(coord_data_train[,1], coord_data_train[,2], col = data_train$Attrition, xlab = "Axe 1", ylab = "Axe 2")
legend('topright', legend = levels(data_train$Attrition), col = 1:2, cex = 0.8, pch = 1)

## Same transformation and PCA for the test set (no label available,
## so no supplementary variable and no colouring).
data_test_log <- log(data_test_num)
data_test_log[data_test_log == -Inf] <- 0
data_test_log <- t(scale(t(data_test_log)))
data_test_log <- as.data.frame(data_test_log)
coord_data_test <- PCA(data_test_log, scale.unit = TRUE, graph = FALSE)$ind$coord[,1:2]
plot(coord_data_test[,1], coord_data_test[,2], xlab = "Axe 1", ylab = "Axe 2")

library(klaR)
## Partition plots of the LDA and QDA decision regions on the first two
## PCA axes, grouped by the true Attrition label.
partimat(coord_data_train, grouping = data_train_num$Attrition, method = "lda")

partimat(coord_data_train, grouping = data_train_num$Attrition, method = "qda")

## k-means with k = 2 on the raw numeric variables (label excluded);
## nstart = 1000 restarts to avoid poor local optima.
res.kmeans <- kmeans(data_train_num[-16], centers = 2, nstart = 1000)
## On the PCA plane: colour = k-means cluster, symbol = true label.
plot(coord_data_train, col = res.kmeans$cluster, pch = as.numeric(data_train$Attrition))

## Cross-tabulate the clusters against the true label.
plot(table(res.kmeans$cluster, data_train$Attrition))

## Model: agglomerative clustering, Ward linkage on Euclidean distances.
## The label column (16) is now excluded: the original passed the whole
## data_train_num, whose Attrition factor was coerced to NA with a
## warning (dist() then merely rescales the remaining coordinates, so
## the tree topology was unaffected — but the warning and the intent
## mismatch with the kmeans call above are fixed here).
cah.ward <- hclust(dist(data_train_num[-16]), method = "ward.D2")
## Cut into 2 clusters (binary choice, matching the Yes/No label)
plot(cah.ward, hang = -1)
rect.hclust(cah.ward, 2)

res.cah <- cutree(cah.ward, 2)

## On the PCA plane: colour = CAH cluster, symbol = true label.
plot(coord_data_train, col = res.cah, pch = as.numeric(data_train$Attrition))

## Cross-tabulate the CAH clusters against the true label.
plot(table(res.cah, data_train$Attrition))

## QDA on the *unbalanced* numeric training set; priors are estimated
## from the class frequencies (~0.84 No / 0.16 Yes, see output below).
## NOTE(review): qda() is from MASS, attached here only as a dependency
## of library(klaR) above; library(MASS) only appears later — confirm.
res.qda = qda(data_train_num[-16], grouping = data_train_num$Attrition)
res.qda
## Call:
## qda(data_train_num[-16], grouping = data_train_num$Attrition)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8354592 0.1645408 
## 
## Group means:
##          Age DailyRate DistanceFromHome EmployeeNumber HourlyRate MonthlyIncome
## No  38.77099  792.5939         9.503817       1023.669   66.86260      7162.046
## Yes 34.42636  756.1938        10.449612       1039.922   67.96899      4947.279
##     MonthlyRate NumCompaniesWorked PercentSalaryHike TotalWorkingYears
## No     14124.12           2.708397          15.32672         12.670229
## Yes    14534.25           3.038760          15.16279          8.387597
##     TrainingTimesLastYear YearsAtCompany YearsInCurrentRole
## No               2.781679       7.767939           4.687023
## Yes              2.604651       5.240310           2.798450
##     YearsSinceLastPromotion YearsWithCurrManager
## No                 2.343511             4.465649
## Yes                1.837209             2.821705
## Resubstitution prediction; in the confusion table, truth is in rows
## and predictions in columns.
pred.qda = predict(res.qda, data_train_num[-16])$class
table(data_train_num$Attrition, pred.qda)
##      pred.qda
##        No Yes
##   No  553 102
##   Yes  59  70

Sur les Yes prédits, on a plus d'erreurs (102) que de cas justes (70), alors que ce n'est pas le cas avec les prédictions sur No.

library(DMwR)
## The classes are heavily imbalanced (655 No vs 129 Yes), which hurts
## the "Yes" predictions above.
table(data_train_num$Attrition)
## 
##  No Yes 
## 655 129
## SMOTE rebalances the training set by synthesising minority-class
## examples and undersampling the majority class (DMwR default
## settings — presumably perc.over = 200, perc.under = 200; verify).
data_train_bal <- SMOTE(Attrition ~ ., data_train_num)
table(data_train_bal$Attrition)
## 
##  No Yes 
## 516 387
library(MASS)

## Model: LDA and QDA on the SMOTE-balanced numeric training set
## (column 16 is the Attrition label, hence the [-16]).
res.lda <- lda(data_train_bal[-16], grouping = data_train_bal$Attrition)
res.qda <- qda(data_train_bal[-16], grouping = data_train_bal$Attrition)
## Prediction on the training sample itself (resubstitution estimate,
## optimistic by construction).
pred.lda <- predict(res.lda, newdata = data_train_bal[-16])
pred.qda <- predict(res.qda, newdata = data_train_bal[-16])
## Confusion matrix; accuracy = trace / total — clearer than indexing
## the two diagonal cells by hand.
conf.lda <- table(pred.lda$class, data_train_bal$Attrition)
accuracy.lda <- sum(diag(conf.lda)) / sum(conf.lda)
plot(conf.lda)

conf.qda <- table(pred.qda$class, data_train_bal$Attrition)
accuracy.qda <- sum(diag(conf.qda)) / sum(conf.qda)
plot(conf.qda)

## ROC curves from the posterior probability of the second class ("Yes")
library(pROC)
ROC.lda <- roc(data_train_bal$Attrition, pred.lda$posterior[,2])
ROC.qda <- roc(data_train_bal$Attrition, pred.qda$posterior[,2])
plot(ROC.lda, print.auc = TRUE, print.auc.y = 0.5, col = 1)
plot(ROC.qda, add = TRUE, print.auc = TRUE, print.auc.y = 0.45, col = 2)
legend("bottomright", lwd = 1, col = 1:2, c("LDA", "QDA"))

library(klaR)

## Model: backward stepwise variable selection for LDA. stepclass()
## scores candidate models by cross-validated correctness rate, so the
## selected set (and the transcript below) can vary between runs.
stepwise.lda = stepclass(data_train_bal[-16], grouping = data_train_bal$Attrition, method = "lda", direction = "backward")
## correctness rate: 0.66889;  starting variables (15): Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager 
## correctness rate: 0.68332;  out: "DailyRate";  variables (14): Age, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager 
## correctness rate: 0.68885;  out: "HourlyRate";  variables (13): Age, DistanceFromHome, EmployeeNumber, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager 
## correctness rate: 0.68996;  out: "MonthlyIncome";  variables (12): Age, DistanceFromHome, EmployeeNumber, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##       0.000       0.000       3.369
stepwise.lda
## method      : lda 
## final model : data_train_bal$Attrition ~ Age + DistanceFromHome + EmployeeNumber + 
##     MonthlyRate + NumCompaniesWorked + PercentSalaryHike + TotalWorkingYears + 
##     TrainingTimesLastYear + YearsAtCompany + YearsInCurrentRole + 
##     YearsSinceLastPromotion + YearsWithCurrManager
## <environment: 0x7fa2c0818438>
## 
## correctness rate = 0.69
## Refit a plain LDA on the selected formula.
res.stepwise.lda = lda(stepwise.lda$formula, data = data_train_bal[-16])
## Prediction (resubstitution)
pred.stepwise.lda <- predict(res.stepwise.lda, newdata = data_train_bal[-16])
## Confusion matrix and resubstitution accuracy
conf.stepwise.lda <- table(pred.stepwise.lda$class, data_train_bal$Attrition)
accuracy.stepwise.lda <- (conf.stepwise.lda[1,1] + conf.stepwise.lda[2,2]) / sum(conf.stepwise.lda)
plot(conf.stepwise.lda)

## ROC curve (posterior probability of "Yes")
ROC.stepwise.lda <- roc(data_train_bal$Attrition, pred.stepwise.lda$posterior[,2])
plot(ROC.stepwise.lda, print.auc=TRUE, print.auc.y = 0.5)
legend("bottomright", lwd = 1, col = 1, "LDA stepwise")

library(rpart)
library(rpart.plot)

## Model: fully grown CART (cp = 0, minsplit = 5), pruned afterwards.
## `Attrition ~ .` on the full balanced data is equivalent to the
## original `data_train_bal$Attrition ~ .` fitted on data_train_bal[-16]
## (same 15 predictors), but keeps the formula self-contained.
arbre.cart <- rpart(Attrition ~ ., data = data_train_bal, control = rpart.control(minsplit = 5, cp = 0))
plotcp(arbre.cart)

## Tree optimisation: keep the complexity parameter that minimises the
## cross-validated error, then prune to it.
cp.opt <- arbre.cart$cptable[which.min(arbre.cart$cptable[, "xerror"]), "CP"]
arbre.opt <- prune(arbre.cart, cp = cp.opt)
rpart.plot(arbre.opt, type = 4, digits = 2)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

## Prediction (resubstitution)
pred.cart.class <- predict(arbre.opt, newdata = data_train_bal[-16], type = "class")
pred.cart.prob <- predict(arbre.opt, newdata = data_train_bal[-16], type = "prob")[,2]
## Confusion matrix; accuracy = trace / total
conf.cart <- table(pred.cart.class, data_train_bal$Attrition)
accuracy.cart <- sum(diag(conf.cart)) / sum(conf.cart)
plot(conf.cart)

## ROC curve (probability of "Yes")
ROC.cart <- roc(data_train_bal$Attrition, pred.cart.prob)
plot(ROC.cart, print.auc = TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "CART")

library(randomForest)

## Model: random forest with default settings (500 trees; the transcript
## shows mtry = 3 for 15 predictors). `Attrition ~ .` on the full
## balanced data is equivalent to the original
## `data_train_bal$Attrition ~ .` fitted on data_train_bal[-16], but
## keeps the formula self-contained.
res.RF <- randomForest(Attrition ~ ., data = data_train_bal)
res.RF
## 
## Call:
##  randomForest(formula = Attrition ~ ., data = data_train_bal) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 10.63%
## Confusion matrix:
##      No Yes class.error
## No  487  29  0.05620155
## Yes  67 320  0.17312661
## Prediction (resubstitution — hence the over-optimistic accuracy below)
pred.RF.class <- predict(res.RF, newdata = data_train_bal[-16], type = "class")
pred.RF.prob <- predict(res.RF, newdata = data_train_bal[-16], type = "prob")[,2]
## Confusion matrix; accuracy = trace / total
conf.RF <- table(pred.RF.class, data_train_bal$Attrition)
accuracy.RF <- sum(diag(conf.RF)) / sum(conf.RF)
plot(conf.RF)

## ROC curve (probability of "Yes")
ROC.RF <- roc(data_train_bal$Attrition, pred.RF.prob)
plot(ROC.RF, print.auc = TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "Random Forest")

library(glmnet)

## Model: L1-penalised (lasso) logistic regression. glmnet needs a
## numeric matrix, so convert once and reuse instead of calling
## as.matrix() four times.
x_bal <- as.matrix(data_train_bal[-16])
res.Lasso <- glmnet(x_bal, data_train_bal$Attrition, family = "binomial")
cv.Lasso <- cv.glmnet(x_bal, data_train_bal$Attrition, family = "binomial", type.measure = "class")
plot(cv.Lasso)

## Prediction at the lambda minimising the CV misclassification error
pred.lasso.class <- predict(cv.Lasso, newx = x_bal, s = 'lambda.min', type = "class")
pred.lasso.prob <- predict(cv.Lasso, newx = x_bal, s = 'lambda.min', type = "response")[,1]
## Confusion matrix; accuracy = trace / total
conf.lasso <- table(pred.lasso.class, data_train_bal$Attrition)
accuracy.lasso <- sum(diag(conf.lasso)) / sum(conf.lasso)
plot(conf.lasso)

## ROC curve (fitted probability of "Yes")
ROC.lasso <- roc(data_train_bal$Attrition, pred.lasso.prob)
plot(ROC.lasso, print.auc = TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "Regression Logistique Lasso")

## Side-by-side comparison of the six classifiers: resubstitution
## accuracy and AUC, one column per model.
result <- rbind(
  accuracy = c(accuracy.lda, accuracy.qda, accuracy.stepwise.lda, accuracy.cart, accuracy.RF, accuracy.lasso),
  AUC = c(ROC.lda$auc, ROC.qda$auc, ROC.stepwise.lda$auc, ROC.cart$auc, ROC.RF$auc, ROC.lasso$auc)
)
colnames(result) <- c('LDA', 'QDA',  'LDA stepwise', 'CART', 'Random Forest',  'Reg. Logi. Lasso')
result
##                LDA       QDA LDA stepwise      CART Random Forest
## accuracy 0.6821705 0.7187154    0.6998893 0.9700997             1
## AUC      0.7496044 0.8135629    0.7447068 0.9906982             1
##          Reg. Logi. Lasso
## accuracy        0.6854928
## AUC             0.7508964
## Best model per criterion (column index of the maximum in each row).
apply(result, 1, which.max )
## accuracy      AUC 
##        5        5
## Overlay all six ROC curves on one plot, colour-matched to the legend.
plot(ROC.lda, xlim = c(1,0))
plot(ROC.qda, add = TRUE, col = 2)
plot(ROC.stepwise.lda, add = TRUE, col = 3)
plot(ROC.cart, add = TRUE, col = 4)
plot(ROC.RF, add = TRUE, col = 5)
plot(ROC.lasso, add = TRUE, col = 6)
legend('bottomright', col = 1:6, paste(colnames(result)),  lwd = 1)

La meilleure méthode de prédiction, en accuracy comme en AUC, est le Random Forest.

## Final prediction with the best model (random forest) on the test set.
pred.Attrition <- predict(res.RF, newdata = data_test_num, type="class")

## Visual check of the predicted classes on the test-set PCA plane.
plot(coord_data_test, col = pred.Attrition)

## Export the predictions alongside the original test data.
## write.csv() always uses "," as separator and warns when `sep` is
## supplied, so the original `sep = ','` argument has been removed
## (output is unchanged).
data_test_pred <- data.frame(pred.Attrition, data_test)
write.csv(data_test_pred, file = "prediction.csv", quote = FALSE)